# for dealing with file directories
import os
# for making the plots
import matplotlib.pyplot as plt
# for dealing with dataframes
import pandas as pd
# for parallel computation
from joblib import Parallel, delayed
# for k mean clustering
from sklearn.cluster import KMeans
# for converting text sequences to features
from sklearn.feature_extraction.text import CountVectorizer
# Set the your working directory
# os.chdir(r"/work/$your_group/$your_name/workshop_2022/CGC_clustering")
# example:
os.chdir(r"/work/yinlab/tangli/workshop_2022/CGC_clustering")
# Read dataframe
all_unsupervised = pd.read_csv("cgc_seq_substr_list.txt", sep="\t", header=None)
# Name columns for the dataframe
all_unsupervised.columns = [
"Genome_ID",
"CGC_ID",
"sig_gene_seq",
"low_level_substr",
"Species",
]
all_unsupervised.head()
| Genome_ID | CGC_ID | sig_gene_seq | low_level_substr | Species | |
|---|---|---|---|---|---|
| 0 | MGYG000000013 | MGYG000000013_1|CGC1 | 1.B.14,GH29,GH16,GH76 | galactomannan | Bacteroides sp902362375 |
| 1 | MGYG000000013 | MGYG000000013_1|CGC2 | 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 | alpha-mannan | Bacteroides sp902362375 |
| 2 | MGYG000000013 | MGYG000000013_1|CGC3 | GH18,8.A.46,1.B.14 | host glycan | Bacteroides sp902362375 |
| 3 | MGYG000000013 | MGYG000000013_1|CGC4 | GH13,GH97,1.B.14 | starch | Bacteroides sp902362375 |
| 4 | MGYG000000013 | MGYG000000013_1|CGC5 | GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... | lactose,host glycan | Bacteroides sp902362375 |
# View the shape of dataframe
all_unsupervised.shape
(3352, 5)
# There are a few duplicates in the dataframe
all_unsupervised["sig_gene_seq"].value_counts()
9.B.146,2.A.103,GT28 24
GT28,2.A.103,9.B.146 18
CBM50,3.A.5 17
1.B.14,8.A.46,GH18 16
3.A.5,CBM50 15
..
2.A.66,GH16_3 1
GH95,1.B.14,GH2,GH35,GH43_10,1.B.14 1
1.B.42,Aminotran_1_2,CBM67|GH78 1
3.A.3,3.A.3,3.A.3,2.A.21,GerE,FecR,1.B.14,GH115 1
1.A.43,GH3,GH158,GH16_3,1.B.14 1
Name: sig_gene_seq, Length: 2463, dtype: int64
# Show an example for duplicates
all_unsupervised[all_unsupervised["sig_gene_seq"] == "9.B.146,2.A.103,GT28"]
| Genome_ID | CGC_ID | sig_gene_seq | low_level_substr | Species | |
|---|---|---|---|---|---|
| 10 | MGYG000000013 | MGYG000000013_1|CGC11 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp902362375 |
| 266 | MGYG000000057 | MGYG000000057_3|CGC2 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp002491635 |
| 396 | MGYG000000105 | MGYG000000105_3|CGC2 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides clarus |
| 669 | MGYG000000224 | MGYG000000224_6|CGC5 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp003545565 |
| 707 | MGYG000000236 | MGYG000000236_3|CGC7 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides fragilis_A |
| 888 | MGYG000000652 | MGYG000000652_11|CGC1 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides togonis |
| 903 | MGYG000000675 | MGYG000000675_1|CGC3 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides congonensis |
| 1288 | MGYG000001345 | MGYG000001345_36|CGC11 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides xylanisolvens |
| 1537 | MGYG000001378 | MGYG000001378_14|CGC1 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides ovatus |
| 1586 | MGYG000001422 | MGYG000001422_2|CGC10 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides oleiciplenus |
| 1701 | MGYG000001433 | MGYG000001433_1|CGC39 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides salyersiae |
| 1759 | MGYG000001461 | MGYG000001461_1|CGC10 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides neonati |
| 1826 | MGYG000001661 | MGYG000001661_1|CGC8 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides gallinarum |
| 1927 | MGYG000002273 | MGYG000002273_9|CGC3 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides stercorirosoris |
| 2392 | MGYG000002549 | MGYG000002549_18|CGC7 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides caccae |
| 2566 | MGYG000003064 | MGYG000003064_3|CGC7 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides graminisolvens |
| 2714 | MGYG000003252 | MGYG000003252_545|CGC1 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp900761785 |
| 2739 | MGYG000003312 | MGYG000003312_4|CGC1 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp900547205 |
| 2922 | MGYG000003363 | MGYG000003363_45|CGC1 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp900766195 |
| 3004 | MGYG000003367 | MGYG000003367_48|CGC5 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp007896885 |
| 3157 | MGYG000004019 | MGYG000004019_6|CGC3 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides massiliensis_A |
| 3195 | MGYG000004185 | MGYG000004185_1|CGC6 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp900553815 |
| 3240 | MGYG000004676 | MGYG000004676_2|CGC3 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp900766005 |
| 3310 | MGYG000004899 | MGYG000004899_9|CGC2 | 9.B.146,2.A.103,GT28 | capsule polysaccharide | Bacteroides sp900555635 |
# Remove duplicates
all_unsupervised = all_unsupervised.drop_duplicates("sig_gene_seq")
all_unsupervised.shape
(2463, 5)
# Write out a csv file
#all_unsupervised.to_csv("unsupervised_cgc_data.tsv", index=False, sep="\t")
all_unsupervised.head()
| Genome_ID | CGC_ID | sig_gene_seq | low_level_substr | Species | |
|---|---|---|---|---|---|
| 0 | MGYG000000013 | MGYG000000013_1|CGC1 | 1.B.14,GH29,GH16,GH76 | galactomannan | Bacteroides sp902362375 |
| 1 | MGYG000000013 | MGYG000000013_1|CGC2 | 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 | alpha-mannan | Bacteroides sp902362375 |
| 2 | MGYG000000013 | MGYG000000013_1|CGC3 | GH18,8.A.46,1.B.14 | host glycan | Bacteroides sp902362375 |
| 3 | MGYG000000013 | MGYG000000013_1|CGC4 | GH13,GH97,1.B.14 | starch | Bacteroides sp902362375 |
| 4 | MGYG000000013 | MGYG000000013_1|CGC5 | GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... | lactose,host glycan | Bacteroides sp902362375 |
# Some low-level subratrates have multiple categories in one string
# removing those
all_unsupervised["lens"] = [
len(str(seq).split(",")) for seq in all_unsupervised["low_level_substr"]
]
# Keep only the clean ones (one subratrate)
all_unsupervised = all_unsupervised[all_unsupervised["lens"] == 1]
all_unsupervised
| Genome_ID | CGC_ID | sig_gene_seq | low_level_substr | Species | lens | |
|---|---|---|---|---|---|---|
| 0 | MGYG000000013 | MGYG000000013_1|CGC1 | 1.B.14,GH29,GH16,GH76 | galactomannan | Bacteroides sp902362375 | 1 |
| 1 | MGYG000000013 | MGYG000000013_1|CGC2 | 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 | alpha-mannan | Bacteroides sp902362375 | 1 |
| 2 | MGYG000000013 | MGYG000000013_1|CGC3 | GH18,8.A.46,1.B.14 | host glycan | Bacteroides sp902362375 | 1 |
| 3 | MGYG000000013 | MGYG000000013_1|CGC4 | GH13,GH97,1.B.14 | starch | Bacteroides sp902362375 | 1 |
| 5 | MGYG000000013 | MGYG000000013_1|CGC6 | 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... | arabinoxylan | Bacteroides sp902362375 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 3340 | MGYG000004899 | MGYG000004899_23|CGC1 | 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... | xylan | Bacteroides sp900555635 | 1 |
| 3342 | MGYG000004899 | MGYG000004899_28|CGC1 | GT0,GT4,9.B.18,GT0 | capsule polysaccharide | Bacteroides sp900555635 | 1 |
| 3343 | MGYG000004899 | MGYG000004899_30|CGC1 | 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... | beta-glucan | Bacteroides sp900555635 | 1 |
| 3344 | MGYG000004899 | MGYG000004899_31|CGC1 | 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... | host glycan | Bacteroides sp900555635 | 1 |
| 3345 | MGYG000004899 | MGYG000004899_32|CGC1 | PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... | rhamnogalacturonan | Bacteroides sp900555635 | 1 |
1931 rows × 6 columns
# all_unsupervised = pd.concat([all_unsupervised_clean], ignore_index = True)
# Count the number of cgc sequences for each subratrate
all_unsupervised["low_level_substr"].value_counts()
host glycan 346
capsule polysaccharide 201
rhamnogalacturonan 122
glycosaminoglycan 120
pectin 108
...
fucosyllactose 1
cellobiose 1
trehalose 1
sorbitol 1
beta-glucoside 1
Name: low_level_substr, Length: 64, dtype: int64
all_unsupervised["low_level_substr"]
0 galactomannan
1 alpha-mannan
2 host glycan
3 starch
5 arabinoxylan
...
3340 xylan
3342 capsule polysaccharide
3343 beta-glucan
3344 host glycan
3345 rhamnogalacturonan
Name: low_level_substr, Length: 1931, dtype: object
# Need to decide on delimeters
all_unsupervised[["sig_gene_seq"]].values[-1][0]
'PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67|GH78,HTH_AraC,GH106'
# Treat "|" as ","
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",")
'PL8_3,CBM67,GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67,GH78,HTH_AraC,GH106'
# Split the cgc sequences
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",").split(",")
['PL8_3', 'CBM67', 'GH78', 'GH28', 'GH88', 'GH2', 'PL8_3', '8.A.46', '1.B.14', 'CBM67', 'GH78', 'HTH_AraC', 'GH106']
all_unsupervised.head()
| Genome_ID | CGC_ID | sig_gene_seq | low_level_substr | Species | lens | |
|---|---|---|---|---|---|---|
| 0 | MGYG000000013 | MGYG000000013_1|CGC1 | 1.B.14,GH29,GH16,GH76 | galactomannan | Bacteroides sp902362375 | 1 |
| 1 | MGYG000000013 | MGYG000000013_1|CGC2 | 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 | alpha-mannan | Bacteroides sp902362375 | 1 |
| 2 | MGYG000000013 | MGYG000000013_1|CGC3 | GH18,8.A.46,1.B.14 | host glycan | Bacteroides sp902362375 | 1 |
| 3 | MGYG000000013 | MGYG000000013_1|CGC4 | GH13,GH97,1.B.14 | starch | Bacteroides sp902362375 | 1 |
| 5 | MGYG000000013 | MGYG000000013_1|CGC6 | 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... | arabinoxylan | Bacteroides sp902362375 | 1 |
# check if there is missing value
all_unsupervised.isnull().sum()
# Drop missing values if possible
# all_unsupervised = all_unsupervised.dropna()
Genome_ID 0 CGC_ID 0 sig_gene_seq 0 low_level_substr 0 Species 0 lens 0 dtype: int64
# Only keep subratrate classes which have more than 2 occurrences
count_df = pd.DataFrame(
all_unsupervised["low_level_substr"].value_counts()[
all_unsupervised["low_level_substr"].value_counts() >= 2
]
).reset_index()
# Get their names
to_keep_substrates = count_df["index"].values
to_keep_substrates
array(['host glycan', 'capsule polysaccharide', 'rhamnogalacturonan',
'glycosaminoglycan', 'pectin', 'alpha-mannan', 'N-glycan',
'homogalacturonan', 'xylan', 'arabinoxylan', 'porphyran',
'hemicellulose', 'mucin', 'acetylated glucuronoxylan',
'beta-glucan', ' ', 'plant polysaccharide', 'xyloglucan',
'arabinogalactan', 'fructan', 'O-antigen', 'galactomannan',
'exopolysaccharide', 'sialoglycoconjugate', 'dextran', 'starch',
'arabinan', 'cellulose', 'glycogen', 'carrageenan', 'alginate',
'maltose', 'beta-mannan', 'O-glycan', 'pullulan', 'mannose',
'alpha-glucan', 'galactan', 'sialic acid', 'ribose', 'chitin',
'xanthan', 'arabinose', 'maltooligosaccharide',
'4-methylumbelliferyl 6-azido-6-deoxy-beta-D-galactoside',
'acarbose', 'glucomannan', 'glucose'], dtype=object)
# Subset to only those that are more than 2
all_unsupervised = all_unsupervised[
all_unsupervised["low_level_substr"].isin(to_keep_substrates)
]
all_unsupervised
| Genome_ID | CGC_ID | sig_gene_seq | low_level_substr | Species | lens | |
|---|---|---|---|---|---|---|
| 0 | MGYG000000013 | MGYG000000013_1|CGC1 | 1.B.14,GH29,GH16,GH76 | galactomannan | Bacteroides sp902362375 | 1 |
| 1 | MGYG000000013 | MGYG000000013_1|CGC2 | 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 | alpha-mannan | Bacteroides sp902362375 | 1 |
| 2 | MGYG000000013 | MGYG000000013_1|CGC3 | GH18,8.A.46,1.B.14 | host glycan | Bacteroides sp902362375 | 1 |
| 3 | MGYG000000013 | MGYG000000013_1|CGC4 | GH13,GH97,1.B.14 | starch | Bacteroides sp902362375 | 1 |
| 5 | MGYG000000013 | MGYG000000013_1|CGC6 | 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... | arabinoxylan | Bacteroides sp902362375 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 3340 | MGYG000004899 | MGYG000004899_23|CGC1 | 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... | xylan | Bacteroides sp900555635 | 1 |
| 3342 | MGYG000004899 | MGYG000004899_28|CGC1 | GT0,GT4,9.B.18,GT0 | capsule polysaccharide | Bacteroides sp900555635 | 1 |
| 3343 | MGYG000004899 | MGYG000004899_30|CGC1 | 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... | beta-glucan | Bacteroides sp900555635 | 1 |
| 3344 | MGYG000004899 | MGYG000004899_31|CGC1 | 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... | host glycan | Bacteroides sp900555635 | 1 |
| 3345 | MGYG000004899 | MGYG000004899_32|CGC1 | PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... | rhamnogalacturonan | Bacteroides sp900555635 | 1 |
1915 rows × 6 columns
# view all cgc sequences in the subset
all_unsupervised["sig_gene_seq"]
0 1.B.14,GH29,GH16,GH76
1 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92
2 GH18,8.A.46,1.B.14
3 GH13,GH97,1.B.14
5 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_...
...
3340 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_...
3342 GT0,GT4,9.B.18,GT0
3343 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2...
3344 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8....
3345 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1....
Name: sig_gene_seq, Length: 1915, dtype: object
# Write selected out a csv file
#all_unsupervised.to_csv("unsupervised_cgc_selected.tsv", index=False, sep="\t")
# Train test split - Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
# Train test split
# Split original data into 70% for clustering and 30% for finding the mappings
X_train, X_test, y_train, y_test = train_test_split(
all_unsupervised["sig_gene_seq"],
all_unsupervised["low_level_substr"],
test_size=0.3,
stratify=all_unsupervised["low_level_substr"].values,
)
## X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=array-like)
## test_size: the proportion of the dataset to include in the train split
## stratify: If not None, data is split in a stratified fashion, using this as the class labels.
# Supervised is for finding the mappings
X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
X_supervised
/tmp/ipykernel_3083247/4208520920.py:2: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
| sig_gene_seq | low_level_substr | |
|---|---|---|
| 0 | GT27,3.A.1 | hemicellulose |
| 1 | 1.B.14,8.A.46,GH33,GH20,GH2,GH20,GH20,1.B.14,8... | sialoglycoconjugate |
| 2 | 1.B.14,8.A.46,GH18,GH29,1.B.14 | N-glycan |
| 3 | GT5,GH57,GT4,GH133,9.B.104,2.A.124,GH3,GH3,CE1... | pectin |
| 4 | 1.B.14,1.B.14,GH88 | host glycan |
| ... | ... | ... |
| 570 | CBM62|CBM62,TPR_1,2.A.16 | 4-methylumbelliferyl 6-azido-6-deoxy-beta-D-ga... |
| 571 | 1.C.105,2.A.49,PL15_2 | glycosaminoglycan |
| 572 | 1.B.14,GH89,2.A.8,GH89 | host glycan |
| 573 | GH95,2.A.51,2.A.51,GH20 | sialoglycoconjugate |
| 574 | 1.B.14,CBM32,1.B.14 | host glycan |
575 rows × 2 columns
# View data for clustering
X_train
361 GH95,4.B.1,1.B.14
1143 GH32,1.B.14,1.A.23,2.A.19
946 1.B.14,FecR,Sigma70_r2|Sigma70_r4_2,GH110,FecR...
546 2.A.1,GH165,1.B.14
2826 9.B.171,PL15_2,HTH_AraC,GH88
...
3268 GH28,GH92,GH89,HTH_AraC,CBM32|GH13_36,1.B.14
2947 GT26,GT32,2.A.66
3005 GH18|CE4|GT2,3.A.1,1.C.39,GT4,GT2,GT2,GT9,4.D.1
2086 PL29,1.A.34,MerR,HD,GH23
2411 PL8_2,GH117|GH117,HTH_3,8.A.46,1.B.14
Name: sig_gene_seq, Length: 1340, dtype: object
# Countvectorizer
# To convert a collection of text documents to a vector of term/token counts
count_vectorizer = CountVectorizer(
tokenizer=lambda x: str(x).replace("|", ",").split(","), lowercase=False
)
# Fit the count vectorizer (learn a vocabulary dictionary of all tokens in the raw documents)
count_vectorizer.fit(X_train)
/util/opt/anaconda/deployed-conda-envs/packages/biopython/envs/biopython-1.79-py39/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:489: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
warnings.warn("The parameter 'token_pattern' will not be used"
CountVectorizer(lowercase=False,
tokenizer=<function <lambda> at 0x151e86752a60>)
# Transform texts to sequences
count_vectorized_array = count_vectorizer.transform(X_train)
count_vectorized_array
# count_vectorized_array.toarray()
<1340x420 sparse matrix of type '<class 'numpy.int64'>' with 7089 stored elements in Compressed Sparse Row format>
# vocabulary - dictionary containing the word (key) and vector (value)
vocabulary = count_vectorizer.vocabulary_
vocabulary
# View
first20vocabulary = {k: vocabulary[k] for k in sorted(vocabulary.keys())[:20]}
first20vocabulary
{'1.A.1': 0,
'1.A.11': 1,
'1.A.13': 2,
'1.A.22': 3,
'1.A.23': 4,
'1.A.26': 5,
'1.A.28': 6,
'1.A.30': 7,
'1.A.33': 8,
'1.A.34': 9,
'1.A.35': 10,
'1.A.62': 11,
'1.A.78': 12,
'1.A.8': 13,
'1.B.14': 14,
'1.B.17': 15,
'1.B.18': 16,
'1.B.42': 17,
'1.B.44': 18,
'1.B.5': 19}
# Inverse vocabulary
inv_map_vocab = {v: k for k, v in vocabulary.items()}
# View
first20_inv_vocabulary = {
k: inv_map_vocab[k] for k in sorted(inv_map_vocab.keys())[:20]
}
first20_inv_vocabulary
{0: '1.A.1',
1: '1.A.11',
2: '1.A.13',
3: '1.A.22',
4: '1.A.23',
5: '1.A.26',
6: '1.A.28',
7: '1.A.30',
8: '1.A.33',
9: '1.A.34',
10: '1.A.35',
11: '1.A.62',
12: '1.A.78',
13: '1.A.8',
14: '1.B.14',
15: '1.B.17',
16: '1.B.18',
17: '1.B.42',
18: '1.B.44',
19: '1.B.5'}
# Affinity Propagation - a machine learning algorithm that can find data centers or clusters
# by sending messages between pairs of data points.
# does not need to know number of clusters
from sklearn.cluster import AffinityPropagation
# Fit the clustering algorithm on the unsupervised data
clustering = AffinityPropagation(random_state=5).fit(count_vectorized_array)
# random_state: pseudo-random number generator to control the starting state.
clustering.labels_
array([ 71, 33, 136, ..., 82, 58, 18])
# For counting
# Counter: it is a collection where elements are stored as dictionary keys
# and their counts are stored as dictionary values.
from collections import Counter
# Count for number of clusters
cluster_freq_info = pd.DataFrame(Counter(clustering.labels_), index=[0]).T.reset_index()
# T: Transpose index and columns
# view
cluster_freq_info.shape
(156, 2)
# Name columns and view
cluster_freq_info.columns = ["cluster_id", "number of samples"]
cluster_freq_info.head()
| cluster_id | number of samples | |
|---|---|---|
| 0 | 71 | 21 |
| 1 | 33 | 29 |
| 2 | 136 | 4 |
| 3 | 91 | 20 |
| 4 | 137 | 4 |
# Select the top clusters for visualization
cluster_freq_info = cluster_freq_info.sort_values("number of samples", ascending=False)
cluster_freq_info
| cluster_id | number of samples | |
|---|---|---|
| 7 | 134 | 67 |
| 49 | 38 | 54 |
| 24 | 57 | 46 |
| 10 | 149 | 38 |
| 1 | 33 | 29 |
| ... | ... | ... |
| 127 | 63 | 1 |
| 100 | 31 | 1 |
| 129 | 66 | 1 |
| 61 | 15 | 1 |
| 155 | 153 | 1 |
156 rows × 2 columns
# We will create clusters to substrate mappings,
# convert strings to vector,
# pass the supervised strings through the count vectorizer
supervised_seqs_array = count_vectorizer.transform(X_supervised["sig_gene_seq"].values)
# supervised_seqs_array
supervised_seqs_array_dense = supervised_seqs_array.todense()
supervised_seqs_array_dense
matrix([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
# Use the fit cluster object to predict clusters on the supervised set
supervised_set_clusters = clustering.predict(supervised_seqs_array_dense)
# Start to create the mapping
mapping_df = pd.DataFrame(X_supervised["low_level_substr"].values)
# Put the predicted clusters
mapping_df["cluster_id"] = supervised_set_clusters
# Sort and name the columns
mapping_df = mapping_df.sort_values("cluster_id")
mapping_df.columns = ["low_level_substr", "cluster_id"]
mapping_df
| low_level_substr | cluster_id | |
|---|---|---|
| 118 | host glycan | 0 |
| 410 | N-glycan | 0 |
| 413 | capsule polysaccharide | 0 |
| 328 | alginate | 0 |
| 497 | exopolysaccharide | 0 |
| ... | ... | ... |
| 126 | exopolysaccharide | 155 |
| 81 | exopolysaccharide | 155 |
| 252 | dextran | 155 |
| 238 | arabinoxylan | 155 |
| 150 | host glycan | 155 |
575 rows × 2 columns
# Aggregate substrate by using the cluster id
mapping_df = pd.DataFrame(
mapping_df.groupby("cluster_id")["low_level_substr"].aggregate(pd.Series.mode)
).reset_index()
# Remove [] from low_level_substr column
catch = []
for seq in mapping_df["low_level_substr"].values:
seq = str(seq).replace("]", "").replace("[", "")
catch.append(seq)
mapping_df["low_level_substr"] = catch
mapping_df
| cluster_id | low_level_substr | |
|---|---|---|
| 0 | 0 | host glycan |
| 1 | 1 | beta-mannan |
| 2 | 2 | host glycan |
| 3 | 3 | arabinan |
| 4 | 4 | host glycan |
| ... | ... | ... |
| 114 | 149 | alpha-mannan |
| 115 | 151 | capsule polysaccharide |
| 116 | 152 | host glycan |
| 117 | 154 | capsule polysaccharide |
| 118 | 155 | exopolysaccharide |
119 rows × 2 columns
# TSNE for visualization
# T-distributed Stochastic Neighbor Embedding - visualize high-dimensional data.
from sklearn.manifold import TSNE
# Covert sparse matrix
count_vectorized_array
<1340x420 sparse matrix of type '<class 'numpy.int64'>' with 7089 stored elements in Compressed Sparse Row format>
# 3D plot
X_embedded = TSNE(
n_components=3, init="pca" # learning_rate='auto',init='random'
).fit_transform(count_vectorized_array.toarray())
# n_components: dimension of the embedded space.
# init: initialization of embedding.
# fit_transform: fit X into an embedded space and return that transformed output.
# View dataframe
X_embedded_df = pd.DataFrame(X_embedded)
X_embedded_df
| 0 | 1 | 2 | |
|---|---|---|---|
| 0 | 6.559994 | -1.155289 | -4.756688 |
| 1 | -33.427830 | -8.903884 | 45.734745 |
| 2 | 21.595331 | 50.405891 | 11.839166 |
| 3 | 9.262816 | 21.649462 | -19.761250 |
| 4 | -13.136441 | 32.683598 | -59.671074 |
| ... | ... | ... | ... |
| 1335 | 26.730934 | -46.706142 | -48.666470 |
| 1336 | -38.275902 | 19.213551 | 2.894465 |
| 1337 | -64.072372 | 7.265289 | -35.391850 |
| 1338 | -21.172161 | 49.175171 | 39.198654 |
| 1339 | 46.532822 | -42.846436 | -32.027657 |
1340 rows × 3 columns
# Add cluster_id column to dataframe
X_embedded_df["cluster_id"] = clustering.labels_
X_embedded_df
| 0 | 1 | 2 | cluster_id | |
|---|---|---|---|---|
| 0 | 6.559994 | -1.155289 | -4.756688 | 71 |
| 1 | -33.427830 | -8.903884 | 45.734745 | 33 |
| 2 | 21.595331 | 50.405891 | 11.839166 | 136 |
| 3 | 9.262816 | 21.649462 | -19.761250 | 91 |
| 4 | -13.136441 | 32.683598 | -59.671074 | 137 |
| ... | ... | ... | ... | ... |
| 1335 | 26.730934 | -46.706142 | -48.666470 | 149 |
| 1336 | -38.275902 | 19.213551 | 2.894465 | 77 |
| 1337 | -64.072372 | 7.265289 | -35.391850 | 82 |
| 1338 | -21.172161 | 49.175171 | 39.198654 | 58 |
| 1339 | 46.532822 | -42.846436 | -32.027657 | 18 |
1340 rows × 4 columns
# Merge mapping_df with X_embedded_df by cluster_id
X_embedded_df = X_embedded_df.merge(mapping_df, how="left", on="cluster_id")
X_embedded_df
| 0 | 1 | 2 | cluster_id | low_level_substr | |
|---|---|---|---|---|---|
| 0 | 6.559994 | -1.155289 | -4.756688 | 71 | 'arabinoxylan' 'glycosaminoglycan' 'host glycan' |
| 1 | -33.427830 | -8.903884 | 45.734745 | 33 | host glycan |
| 2 | 21.595331 | 50.405891 | 11.839166 | 136 | host glycan |
| 3 | 9.262816 | 21.649462 | -19.761250 | 91 | 'exopolysaccharide' 'host glycan' |
| 4 | -13.136441 | 32.683598 | -59.671074 | 137 | glycosaminoglycan |
| ... | ... | ... | ... | ... | ... |
| 1335 | 26.730934 | -46.706142 | -48.666470 | 149 | alpha-mannan |
| 1336 | -38.275902 | 19.213551 | 2.894465 | 77 | 'capsule polysaccharide' 'carrageenan' |
| 1337 | -64.072372 | 7.265289 | -35.391850 | 82 | capsule polysaccharide |
| 1338 | -21.172161 | 49.175171 | 39.198654 | 58 | alginate |
| 1339 | 46.532822 | -42.846436 | -32.027657 | 18 | glycosaminoglycan |
1340 rows × 5 columns
# View top 10 substrates
X_embedded_df["low_level_substr"].value_counts().head(n=10)
host glycan 193 capsule polysaccharide 111 rhamnogalacturonan 106 glycosaminoglycan 105 alpha-mannan 72 homogalacturonan 55 beta-glucan 54 'N-glycan' 'host glycan' 46 acetylated glucuronoxylan 36 xylan 29 Name: low_level_substr, dtype: int64
# Select top 10 substrates
top_10_classes = X_embedded_df["low_level_substr"].value_counts()[:10]
top_10_classes
host glycan 193 capsule polysaccharide 111 rhamnogalacturonan 106 glycosaminoglycan 105 alpha-mannan 72 homogalacturonan 55 beta-glucan 54 'N-glycan' 'host glycan' 46 acetylated glucuronoxylan 36 xylan 29 Name: low_level_substr, dtype: int64
# Make top 10 substrates to list
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)
top_10_classes_list
['host glycan', 'capsule polysaccharide', 'rhamnogalacturonan', 'glycosaminoglycan', 'alpha-mannan', 'homogalacturonan', 'beta-glucan', "'N-glycan' 'host glycan'", 'acetylated glucuronoxylan', 'xylan']
# Subset X_embedded_df dataframe by list
X_embedded_df_viz = X_embedded_df[
X_embedded_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_df_viz
| 0 | 1 | 2 | cluster_id | low_level_substr | |
|---|---|---|---|---|---|
| 1 | -33.427830 | -8.903884 | 45.734745 | 33 | host glycan |
| 2 | 21.595331 | 50.405891 | 11.839166 | 136 | host glycan |
| 4 | -13.136441 | 32.683598 | -59.671074 | 137 | glycosaminoglycan |
| 5 | -23.056034 | -4.087102 | 29.942402 | 33 | host glycan |
| 7 | -65.945145 | 30.958498 | 13.941587 | 151 | capsule polysaccharide |
| ... | ... | ... | ... | ... | ... |
| 1330 | -53.821823 | 24.164757 | -6.134601 | 154 | capsule polysaccharide |
| 1334 | 42.950474 | -31.631836 | -29.714808 | 99 | glycosaminoglycan |
| 1335 | 26.730934 | -46.706142 | -48.666470 | 149 | alpha-mannan |
| 1337 | -64.072372 | 7.265289 | -35.391850 | 82 | capsule polysaccharide |
| 1339 | 46.532822 | -42.846436 | -32.027657 | 18 | glycosaminoglycan |
807 rows × 5 columns
# astype:change a pandas object to a specified dtype
X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
/tmp/ipykernel_3083247/3803832902.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
# plotly - an interactive, open-source, and browser-based graphing library
import plotly.express as px
fig = px.scatter_3d(X_embedded_df_viz, x=0, y=1, z=2, color="low_level_substr")
#fig.write_html("workshop_clusters.html")
# view plot
fig.show()
# view X_embedded_df dataframe
X_embedded_df.shape
(1340, 5)
# Creat a new dataframe - summarize cgc sequences, cluster ids and substrates
final_df_cgc_cluster_substrate = X_embedded_df[["low_level_substr", "cluster_id"]]
final_df_cgc_cluster_substrate["sig_gene_seq"] = X_train.values
final_df_cgc_cluster_substrate["sig_gene_seq"]
/tmp/ipykernel_3083247/3070682039.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
0 GH95,4.B.1,1.B.14
1 GH32,1.B.14,1.A.23,2.A.19
2 1.B.14,FecR,Sigma70_r2|Sigma70_r4_2,GH110,FecR...
3 2.A.1,GH165,1.B.14
4 9.B.171,PL15_2,HTH_AraC,GH88
...
1335 GH28,GH92,GH89,HTH_AraC,CBM32|GH13_36,1.B.14
1336 GT26,GT32,2.A.66
1337 GH18|CE4|GT2,3.A.1,1.C.39,GT4,GT2,GT2,GT9,4.D.1
1338 PL29,1.A.34,MerR,HD,GH23
1339 PL8_2,GH117|GH117,HTH_3,8.A.46,1.B.14
Name: sig_gene_seq, Length: 1340, dtype: object
# Name the columns
final_df_cgc_cluster_substrate[["sig_gene_seq", "cluster_id", "low_level_substr"]]
| sig_gene_seq | cluster_id | low_level_substr | |
|---|---|---|---|
| 0 | GH95,4.B.1,1.B.14 | 71 | 'arabinoxylan' 'glycosaminoglycan' 'host glycan' |
| 1 | GH32,1.B.14,1.A.23,2.A.19 | 33 | host glycan |
| 2 | 1.B.14,FecR,Sigma70_r2|Sigma70_r4_2,GH110,FecR... | 136 | host glycan |
| 3 | 2.A.1,GH165,1.B.14 | 91 | 'exopolysaccharide' 'host glycan' |
| 4 | 9.B.171,PL15_2,HTH_AraC,GH88 | 137 | glycosaminoglycan |
| ... | ... | ... | ... |
| 1335 | GH28,GH92,GH89,HTH_AraC,CBM32|GH13_36,1.B.14 | 149 | alpha-mannan |
| 1336 | GT26,GT32,2.A.66 | 77 | 'capsule polysaccharide' 'carrageenan' |
| 1337 | GH18|CE4|GT2,3.A.1,1.C.39,GT4,GT2,GT2,GT9,4.D.1 | 82 | capsule polysaccharide |
| 1338 | PL29,1.A.34,MerR,HD,GH23 | 58 | alginate |
| 1339 | PL8_2,GH117|GH117,HTH_3,8.A.46,1.B.14 | 18 | glycosaminoglycan |
1340 rows × 3 columns
# View substrate by using cluster id
mapping_df[mapping_df["cluster_id"] == 0]
| cluster_id | low_level_substr | |
|---|---|---|
| 0 | 0 | host glycan |
# View number of sequences by using cluster id
cluster_freq_info[cluster_freq_info["cluster_id"] == 68]
| cluster_id | number of samples | |
|---|---|---|
| 25 | 68 | 3 |
final_df_cgc_cluster_substrate["cluster_id"].value_counts()[:10]
134 67 38 54 57 46 149 38 33 29 61 28 92 27 62 27 14 26 60 24 Name: cluster_id, dtype: int64
# Write out a csv file
#final_df_cgc_cluster_substrate.to_csv("cgc_cluster_substrate.tsv", index=False, sep="\t")
# 2D plot
X_embedded_2D = TSNE(n_components=2, init="pca").fit_transform(
count_vectorized_array.toarray()
)
X_embedded_2D_df = pd.DataFrame(X_embedded_2D)
X_embedded_2D_df["cluster_id"] = clustering.labels_
X_embedded_2D_df = X_embedded_2D_df.merge(mapping_df, how="left", on="cluster_id")
X_embedded_2D_df
| 0 | 1 | cluster_id | low_level_substr | |
|---|---|---|---|---|
| 0 | 5.919950 | 5.166234 | 71 | 'arabinoxylan' 'glycosaminoglycan' 'host glycan' |
| 1 | -6.791799 | 6.786656 | 33 | host glycan |
| 2 | 8.243697 | 27.787916 | 136 | host glycan |
| 3 | 6.899367 | 11.483501 | 91 | 'exopolysaccharide' 'host glycan' |
| 4 | -28.028196 | 17.804018 | 137 | glycosaminoglycan |
| ... | ... | ... | ... | ... |
| 1335 | 30.564066 | 21.215761 | 149 | alpha-mannan |
| 1336 | -37.680904 | 6.757950 | 77 | 'capsule polysaccharide' 'carrageenan' |
| 1337 | -49.747913 | 23.629011 | 82 | capsule polysaccharide |
| 1338 | -31.859072 | 23.341862 | 58 | alginate |
| 1339 | 47.376091 | -6.875403 | 18 | glycosaminoglycan |
1340 rows × 4 columns
top_10_classes = X_embedded_2D_df["low_level_substr"].value_counts()[:10]
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)
X_embedded_2D_df_viz = X_embedded_df[
X_embedded_2D_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_2D_df_viz["low_level_substr"] = X_embedded_2D_df_viz[
"low_level_substr"
].astype(str)
fig = px.scatter(X_embedded_2D_df_viz, x=0, y=1, color="low_level_substr")
#fig.write_html("workshop_clusters_2D_10top.html")
fig.show()
/tmp/ipykernel_3083247/2587830699.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy